Skip to content

Commit

Permalink
controller: update condition reason when upgrade job crashes
Browse files Browse the repository at this point in the history
the current code will only update the condition reason when the job
fails, but if the job crashes the reconcile loop will stay stuck in the
migration in progress state and never recreate the upgrade job.
this fixes the problem by always updating the condition reason to
migrations failed after having checked that the job is neither active
nor succeeded.

relates to PROJQUAY-1812
  • Loading branch information
flavianmissi committed Jan 4, 2022
1 parent 550912e commit 4351f02
Showing 1 changed file with 24 additions and 22 deletions.
46 changes: 24 additions & 22 deletions controllers/quay/quayregistry_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,34 +188,36 @@ func (r *QuayRegistryReconciler) Reconcile(ctx context.Context, req ctrl.Request
return ctrl.Result{RequeueAfter: time.Second}, nil
}

// when the job fails, the next reconciliation to run will think
// that migrations are currently not running, since we change
// the condition reason to v1.ConditionReasonMigrationsFailed.
log.Info("upgrade job failed or crashed.", "upgrade-job-status", upgradeJob.Status)

// a kube job can be Active, Succeeded or Failed.
// we explicitly check for Active and Succeeded above. if no pods
// are in either state it means the job either failed or crashed.
// crashed pods are not marked as Failed, so we don't check.
//
// when the job crashes or fails, the next reconciliation to run
// will think that migrations are currently not running, since
// we change the condition reason to v1.ConditionReasonMigrationsFailed.
// this doesn't necessarily describe reality, because kube will
// retry failed jobs for us, but it's desired behaviour because
// when the migration job fails due to misconfiguration, then the
// reconcile function should be allowed to proceed.
if upgradeJob.Status.Failed == 1 {
msg := "failed to run migrations"
for _, cond := range upgradeJob.Status.Conditions {
if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue {
msg = cond.Message
break
}
msg := "failed to run migrations"
for _, cond := range upgradeJob.Status.Conditions {
if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue {
msg = cond.Message
break
}
updatedQuay, err = r.updateWithCondition(
updatedQuay,
v1.ConditionComponentsCreated,
metav1.ConditionFalse,
v1.ConditionReasonMigrationsFailed,
msg)
if err != nil {
log.Error(err, "failed to update `conditions` of `QuayRegistry`")
}
return ctrl.Result{RequeueAfter: time.Second}, nil
}

log.Info("Unexpected job status.", fmt.Sprintf("%+v", upgradeJob.Status))
updatedQuay, err = r.updateWithCondition(
updatedQuay,
v1.ConditionComponentsCreated,
metav1.ConditionFalse,
v1.ConditionReasonMigrationsFailed,
msg)
if err != nil {
log.Error(err, "failed to update `conditions` of `QuayRegistry`")
}
return ctrl.Result{RequeueAfter: time.Second}, nil
}

Expand Down

0 comments on commit 4351f02

Please sign in to comment.