From 4f48336830eeca5e00090fe14d4ac4abe1e5d3a6 Mon Sep 17 00:00:00 2001 From: YZ775 Date: Mon, 11 Mar 2024 00:41:19 +0000 Subject: [PATCH 1/6] change CKE to proceed rebooting immediately after draining of node is completed --- op/reboot_decide.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/op/reboot_decide.go b/op/reboot_decide.go index c72e6b88..6f8f388c 100644 --- a/op/reboot_decide.go +++ b/op/reboot_decide.go @@ -196,12 +196,10 @@ func ChooseDrainedNodes(c *cke.Cluster, apiServers map[string]bool, rqEntries [] return nil } } - if len(workerInProgress) >= maxConcurrentReboots { - return nil - } else if len(workerInProgress)+len(workerDrainable) <= maxConcurrentReboots { - return workerDrainable + if len(workerInProgress) < maxConcurrentReboots && len(workerDrainable) > 0 { + return workerDrainable[:1] } else { - return workerDrainable[:maxConcurrentReboots-len(workerInProgress)] + return nil } } From 5798f89d722852f2561ed2305a2d87063e2fecbd Mon Sep 17 00:00:00 2001 From: YZ775 Date: Thu, 18 Apr 2024 07:40:53 +0000 Subject: [PATCH 2/6] change priority of RebootRebootOp --- server/strategy.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/strategy.go b/server/strategy.go index c02704e5..2e8843b7 100644 --- a/server/strategy.go +++ b/server/strategy.go @@ -889,6 +889,10 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp ops = append(ops, op.RebootCancelOp(rebootArgs.RebootCancelled)) return ops, phaseReboot } + if len(rebootArgs.DrainCompleted) > 0 { + phaseReboot = true + ops = append(ops, op.RebootRebootOp(nf.HealthyAPIServer(), rebootArgs.DrainCompleted, &c.Reboot)) + } if len(rebootArgs.NewlyDrained) > 0 { phaseReboot = true sshCheckNodes := make([]*cke.Node, 0, len(nf.cluster.Nodes)) @@ -903,10 +907,6 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp ops = append(ops, op.RebootDrainStartOp(nf.HealthyAPIServer(), rebootArgs.NewlyDrained, &c.Reboot)) } } - if len(rebootArgs.DrainCompleted) > 0 { - phaseReboot = true - ops = append(ops, op.RebootRebootOp(nf.HealthyAPIServer(), rebootArgs.DrainCompleted, &c.Reboot)) - } if len(rebootArgs.DrainTimedout) > 0 { phaseReboot = true ops = append(ops, op.RebootDrainTimeoutOp(rebootArgs.DrainTimedout)) From 639d2973f9cc51bd0c124016d0c3374d76fd8163 Mon Sep 17 00:00:00 2001 From: YZ775 Date: Fri, 19 Apr 2024 00:52:20 +0000 Subject: [PATCH 3/6] add cancel check in drainBackOff --- op/reboot.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/op/reboot.go b/op/reboot.go index 03817c8a..d870d3ae 100644 --- a/op/reboot.go +++ b/op/reboot.go @@ -568,6 +568,13 @@ func drainBackOff(ctx context.Context, inf cke.Infrastructure, entry *cke.Reboot "name": entry.Node, log.FnError: err, }) + etcdEntry, err := inf.Storage().GetRebootsEntry(ctx, entry.Index) + if err != nil { + return err + } + if etcdEntry.Status == cke.RebootStatusCancelled { + return nil + } entry.Status = cke.RebootStatusQueued entry.LastTransitionTime = time.Now().Truncate(time.Second).UTC() entry.DrainBackOffCount++ From 609f6aa5f4255173c7f61d2cb649eaff806ff615 Mon Sep 17 00:00:00 2001 From: YZ775 Date: Tue, 23 Apr 2024 00:22:30 +0000 Subject: [PATCH 4/6] change priority of RebootDequeueOp --- server/strategy.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/server/strategy.go b/server/strategy.go index 2e8843b7..928d0b48 100644 --- a/server/strategy.go +++ b/server/strategy.go @@ -889,6 +889,11 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp ops = append(ops, op.RebootCancelOp(rebootArgs.RebootCancelled)) return ops, phaseReboot } + if len(rebootArgs.RebootDequeued) > 0 { + phaseReboot = true + ops = append(ops, op.RebootDequeueOp(rebootArgs.RebootDequeued)) + return ops, phaseReboot + } if len(rebootArgs.DrainCompleted) > 0 { phaseReboot = true ops = append(ops, op.RebootRebootOp(nf.HealthyAPIServer(), rebootArgs.DrainCompleted, &c.Reboot)) @@ -911,10 +916,6 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp phaseReboot = true ops = append(ops, op.RebootDrainTimeoutOp(rebootArgs.DrainTimedout)) } - if len(rebootArgs.RebootDequeued) > 0 { - phaseReboot = true - ops = append(ops, op.RebootDequeueOp(rebootArgs.RebootDequeued)) - } if len(ops) > 0 { phaseReboot = true } From 7eab07c4e5a3b5d8622e821a05688224d582a9cb Mon Sep 17 00:00:00 2001 From: YZ775 Date: Tue, 23 Apr 2024 06:24:43 +0000 Subject: [PATCH 5/6] fixup! change priority of RebootDequeueOp --- server/strategy.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/server/strategy.go b/server/strategy.go index 928d0b48..4ceacbef 100644 --- a/server/strategy.go +++ b/server/strategy.go @@ -885,21 +885,19 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp } if len(rebootArgs.RebootCancelled) > 0 { - phaseReboot = true ops = append(ops, op.RebootCancelOp(rebootArgs.RebootCancelled)) - return ops, phaseReboot } if len(rebootArgs.RebootDequeued) > 0 { - phaseReboot = true ops = append(ops, op.RebootDequeueOp(rebootArgs.RebootDequeued)) - return ops, phaseReboot } + if len(ops) > 0 { + return ops, true + } + if len(rebootArgs.DrainCompleted) > 0 { - phaseReboot = true ops = append(ops, op.RebootRebootOp(nf.HealthyAPIServer(), rebootArgs.DrainCompleted, &c.Reboot)) } if len(rebootArgs.NewlyDrained) > 0 { - phaseReboot = true sshCheckNodes := make([]*cke.Node, 0, len(nf.cluster.Nodes)) for _, node := range nf.cluster.Nodes { if !rebootProcessing(rebootArgs.RQEntries, node.Address) { @@ -913,7 +911,6 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp } } if len(rebootArgs.DrainTimedout) > 0 { - phaseReboot = true ops = append(ops, op.RebootDrainTimeoutOp(rebootArgs.DrainTimedout)) } if len(ops) > 0 { From 24224de1b7ff88454ac0d1faebce39170ca8de3c Mon Sep 17 00:00:00 2001 From: YZ775 Date: Fri, 26 Apr 2024 00:17:39 +0000 Subject: [PATCH 6/6] delete unnecessary return value phaseReboot Signed-off-by: YZ775 --- server/strategy.go | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/server/strategy.go b/server/strategy.go index 4ceacbef..bc5e5c46 100644 --- a/server/strategy.go +++ b/server/strategy.go @@ -104,7 +104,7 @@ func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain } // 11. Reboot nodes if reboot request has been arrived to the reboot queue, and the number of unreachable nodes is less than a threshold. - if ops, phaseReboot := rebootOps(c, constraints, rebootArgs, nf); phaseReboot { + if ops := rebootOps(c, constraints, rebootArgs, nf); len(ops) > 0 { if !nf.EtcdIsGood() { log.Warn("cannot reboot nodes because etcd cluster is not responding and in-sync", nil) return nil, cke.PhaseRebootNodes @@ -871,17 +871,17 @@ func repairOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain return ops, phaseRepair } -func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter) (ops []cke.Operator, phaseReboot bool) { +func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter) (ops []cke.Operator) { if len(rebootArgs.RQEntries) == 0 { - return nil, false + return nil } if len(c.Reboot.RebootCommand) == 0 { log.Warn("reboot command is not specified in the cluster configuration", nil) - return nil, false + return nil } if len(c.Reboot.BootCheckCommand) == 0 { log.Warn("boot check command is not specified in the cluster configuration", nil) - return nil, false + return nil } if len(rebootArgs.RebootCancelled) > 0 { @@ -891,7 +891,7 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp ops = append(ops, op.RebootDequeueOp(rebootArgs.RebootDequeued)) } if len(ops) > 0 { - return ops, true + return ops } if len(rebootArgs.DrainCompleted) > 0 { @@ -913,11 +913,8 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp if len(rebootArgs.DrainTimedout) > 0 { ops = append(ops, op.RebootDrainTimeoutOp(rebootArgs.DrainTimedout)) } - if len(ops) > 0 { - phaseReboot = true - } - return ops, phaseReboot + return ops } func rebootUncordonOp(cs *cke.ClusterStatus, rqEntries []*cke.RebootQueueEntry, nf *NodeFilter) cke.Operator {