Skip to content

Commit

Permalink
Add support for KCP remediation during cluster provisioning
Browse files Browse the repository at this point in the history
  • Loading branch information
fabriziopandini committed Jan 21, 2023
1 parent 7527262 commit d461439
Show file tree
Hide file tree
Showing 22 changed files with 2,315 additions and 314 deletions.
7 changes: 7 additions & 0 deletions controlplane/kubeadm/api/v1alpha3/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {

dst.Spec.RolloutBefore = restored.Spec.RolloutBefore

if restored.Spec.RemediationStrategy != nil {
dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
}
if restored.Status.LastRemediation != nil {
dst.Status.LastRemediation = restored.Status.LastRemediation
}

return nil
}

Expand Down
2 changes: 2 additions & 0 deletions controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions controlplane/kubeadm/api/v1alpha4/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {
dst.Spec.RolloutBefore = restored.Spec.RolloutBefore
dst.Spec.MachineTemplate.NodeVolumeDetachTimeout = restored.Spec.MachineTemplate.NodeVolumeDetachTimeout

if restored.Spec.RemediationStrategy != nil {
dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
}
if restored.Status.LastRemediation != nil {
dst.Status.LastRemediation = restored.Status.LastRemediation
}

return nil
}

Expand Down Expand Up @@ -234,5 +241,11 @@ func Convert_v1beta1_KubeadmControlPlaneMachineTemplate_To_v1alpha4_KubeadmContr

// Convert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec converts a v1beta1
// KubeadmControlPlaneSpec into its v1alpha4 form by delegating to the generated conversion function.
//
// Fields that were added in v1beta1 and are therefore not handled explicitly here:
//   - .RolloutBefore
//   - .RemediationStrategy
func Convert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in *controlplanev1.KubeadmControlPlaneSpec, out *KubeadmControlPlaneSpec, scope apiconversion.Scope) error {
	if err := autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in, out, scope); err != nil {
		return err
	}
	return nil
}

// Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus converts a v1beta1
// KubeadmControlPlaneStatus into its v1alpha4 form by delegating to the generated conversion function.
//
// Fields that were added in v1beta1 and are therefore not handled explicitly here:
//   - .LastRemediation
func Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in *controlplanev1.KubeadmControlPlaneStatus, out *KubeadmControlPlaneStatus, scope apiconversion.Scope) error {
	if err := autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in, out, scope); err != nil {
		return err
	}
	return nil
}
17 changes: 7 additions & 10 deletions controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

76 changes: 76 additions & 0 deletions controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package v1beta1

import (
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
Expand Down Expand Up @@ -49,6 +51,23 @@ const (
// KubeadmClusterConfigurationAnnotation is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration.
// This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP.
KubeadmClusterConfigurationAnnotation = "controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration"

// RemediatingInProgressAnnotation is used to keep track that a KCP remediation is in progress, and more
// specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement.
// NOTE: if something external to CAPI removes this annotation the system cannot detect the above situation; this can lead to
// failures in updating remediation retry or remediation count (both counters restart from zero).
RemediatingInProgressAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/remediation-in-progress"

// MachineRemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing;
// please note that in case of retry, when the remediating machine also fails, the system keeps track of
// the first machine of the sequence only.
// NOTE: if something external to CAPI removes this annotation, this can lead to
// failures in updating remediation retry (the counter restarts from zero).
MachineRemediationForAnnotation = "kubeadm.controlplane.cluster.x-k8s.io/remediation-for"

// DefaultMinHealthyPeriod defines the default minimum duration before we consider a remediation on a
// machine unrelated to the previous remediation.
DefaultMinHealthyPeriod = 1 * time.Hour
)

// KubeadmControlPlaneSpec defines the desired state of KubeadmControlPlane.
Expand Down Expand Up @@ -91,6 +110,10 @@ type KubeadmControlPlaneSpec struct {
// +optional
// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`

// The RemediationStrategy that controls how control plane machines remediation happens.
// +optional
RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
}

// KubeadmControlPlaneMachineTemplate defines the template for Machines
Expand Down Expand Up @@ -158,6 +181,40 @@ type RollingUpdate struct {
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

// RemediationStrategy allows to define how control plane machines remediation happens.
type RemediationStrategy struct {
// MaxRetry is the maximum number of retries while attempting to remediate an unhealthy machine.
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
// For example, given a control plane with three machines M1, M2, M3:
//
// M1 becomes unhealthy; remediation happens, and M1-1 is created as a replacement.
// If M1-1 (replacement of M1) has problems while bootstrapping it will become unhealthy, and then be
// remediated; such operation is considered a retry, remediation-retry #1.
// If M1-2 (replacement of M1-1) becomes unhealthy, remediation-retry #2 will happen, etc.
//
// A retry can happen only after RetryPeriod from the previous retry.
// If a machine is marked as unhealthy after MinHealthyPeriod from the previous remediation expired,
// this is not considered a retry anymore because the new issue is assumed unrelated to the previous one.
//
// If not set, infinite retries will be attempted.
// +optional
MaxRetry *int32 `json:"maxRetry,omitempty"`

// NOTE(review): the JSON key is "retryDelaySeconds" while the Go field is RetryPeriod and the value is a
// metav1.Duration rather than a plain number of seconds; also, omitempty presumably has no effect on a
// non-pointer struct value. Confirm both are intended.

// RetryPeriod is the duration that KCP should wait before remediating a machine being created as a replacement
// for an unhealthy machine (a retry).
//
// If not set, a retry will happen immediately.
// +optional
RetryPeriod metav1.Duration `json:"retryDelaySeconds,omitempty"`

// NOTE(review): the JSON key is "minHealthySeconds" while the Go field is MinHealthyPeriod and the value is a
// metav1.Duration rather than a plain number of seconds. Confirm the key is intended.

// MinHealthyPeriod defines the duration after which KCP will consider any failure to a machine unrelated
// to the previous one, and thus a new remediation is not considered a retry anymore.
//
// If not set, this value is defaulted to 1h.
// +optional
MinHealthyPeriod *metav1.Duration `json:"minHealthySeconds,omitempty"`
}

// KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane.
type KubeadmControlPlaneStatus struct {
// Selector is the label selector in string format to avoid introspection
Expand Down Expand Up @@ -223,6 +280,25 @@ type KubeadmControlPlaneStatus struct {
// Conditions defines current service state of the KubeadmControlPlane.
// +optional
Conditions clusterv1.Conditions `json:"conditions,omitempty"`

// LastRemediation stores info about last remediation performed.
// +optional
LastRemediation *LastRemediationStatus `json:"lastRemediation,omitempty"`
}

// LastRemediationStatus stores info about last remediation performed.
// NOTE: if for any reason information about last remediation is lost, RetryCount is going to restart from 0 and thus
// more remediations than expected might happen.
type LastRemediationStatus struct {
// Machine is the machine name of the latest machine being remediated.
Machine string `json:"machine"`

// NOTE(review): this field was documented as an RFC 3339 date and time, but its type is metav1.Timestamp,
// which (to be confirmed against apimachinery) is a seconds/nanos pair; metav1.Time is presumably the type
// that serializes as RFC 3339. Confirm which representation is intended.

// Timestamp is the date and time at which last remediation happened.
Timestamp metav1.Timestamp `json:"timestamp"`

// RetryCount is used to keep track of remediation retries for the last remediated machine.
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
RetryCount int32 `json:"retryCount"`
}

// +kubebuilder:object:root=true
Expand Down
52 changes: 52 additions & 0 deletions controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit d461439

Please sign in to comment.