From b1a0d828ceed898e1e83c5622c181601957abade Mon Sep 17 00:00:00 2001 From: Rahul Ganesh <31204974+rahulbabu95@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:59:32 -0700 Subject: [PATCH] Account for machine healthiness during reconciliation (#63) Signed-off-by: Rahul Ganesh --- controllers/status.go | 12 +++++++--- controllers/status_test.go | 49 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/controllers/status.go b/controllers/status.go index 93b0874..4e3cd83 100644 --- a/controllers/status.go +++ b/controllers/status.go @@ -28,14 +28,20 @@ func (r *EtcdadmClusterReconciler) updateStatus(ctx context.Context, ec *etcdv1. desiredReplicas := *ec.Spec.Replicas - ec.Status.ReadyReplicas = int32(len(ownedMachines)) + // Only consider a healthy machine as a ready replica + // This will prevent an owned machine being deleted due to a catastrophic event from being considered ready. + readyReplicas := int32(0) + for _, m := range ownedMachines { + if m.healthy() { + readyReplicas++ + } + } + ec.Status.ReadyReplicas = readyReplicas if !ec.DeletionTimestamp.IsZero() { return nil } - readyReplicas := ec.Status.ReadyReplicas - if readyReplicas < desiredReplicas { conditions.MarkFalse(ec, etcdv1.EtcdClusterResizeCompleted, etcdv1.EtcdScaleUpInProgressReason, clusterv1.ConditionSeverityWarning, "Scaling up etcd cluster to %d replicas (actual %d)", desiredReplicas, readyReplicas) ec.Status.Ready = false diff --git a/controllers/status_test.go b/controllers/status_test.go index be61de9..202b187 100644 --- a/controllers/status_test.go +++ b/controllers/status_test.go @@ -60,6 +60,55 @@ func TestUpdateStatusResizeIncomplete(t *testing.T) { g.Expect(conditions.IsTrue(etcdadmCluster, etcdv1.EtcdClusterResizeCompleted)).To(BeFalse()) } +func TestUpdateStatusMachineUnhealthy(t *testing.T) { + g := NewWithT(t) + + cluster := newClusterWithExternalEtcd() + etcdadmCluster := newEtcdadmCluster(cluster) + + machine1 := newEtcdMachine(etcdadmCluster, cluster) + machine2 := newEtcdMachine(etcdadmCluster, cluster) + + etcdMachine1 := etcdMachine{ + Machine: machine1, + endpoint: "1.1.1.1", + listening: true, + healthError: nil, + } + etcdMachine2 := etcdMachine{ + Machine: machine2, + endpoint: "1.1.1.1", + listening: false, + healthError: nil, + } + + ownedMachines := map[string]etcdMachine{ + "machine1": etcdMachine1, + "machine2": etcdMachine2, + } + + objects := []client.Object{ + cluster, + etcdadmCluster, + infraTemplate.DeepCopy(), + machine1, + machine2, + } + + fakeClient := fake.NewClientBuilder().WithScheme(setupScheme()).WithObjects(objects...).Build() + + r := &EtcdadmClusterReconciler{ + Client: fakeClient, + uncachedClient: fakeClient, + Log: log.Log, + } + + err := r.updateStatus(ctx, etcdadmCluster, cluster, ownedMachines) + g.Expect(etcdadmCluster.Status.ReadyReplicas).To(Equal(int32(1))) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(conditions.IsTrue(etcdadmCluster, etcdv1.EtcdClusterResizeCompleted)).To(BeFalse()) +} + func TestUpdateStatusResizeComplete(t *testing.T) { g := NewWithT(t)