Skip to content
This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

Commit

Permalink
Add ImagePullBackoffGracePeriod configuration (#370)
Browse files Browse the repository at this point in the history
* added ImagePullBackoffGracePeriod configuration

Signed-off-by: Daniel Rammer <[email protected]>

* fixed unit tests

Signed-off-by: Daniel Rammer <[email protected]>

---------

Signed-off-by: Daniel Rammer <[email protected]>
  • Loading branch information
hamersaw authored Jul 7, 2023
1 parent f7b37aa commit 3d2525a
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 6 deletions.
8 changes: 8 additions & 0 deletions go/tasks/pluginmachinery/flytek8s/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ var (
CreateContainerErrorGracePeriod: config2.Duration{
Duration: time.Minute * 3,
},
ImagePullBackoffGracePeriod: config2.Duration{
Duration: time.Minute * 3,
},
GpuResourceName: ResourceNvidiaGPU,
DefaultPodTemplateResync: config2.Duration{
Duration: 30 * time.Second,
Expand Down Expand Up @@ -132,6 +135,11 @@ type K8sPluginConfig struct {
// one, and the corresponding task marked as failed
CreateContainerErrorGracePeriod config2.Duration `json:"create-container-error-grace-period" pflag:"-,Time to wait for transient CreateContainerError errors to be resolved."`

// Time to wait for transient ImagePullBackOff errors to be resolved. If the
// error persists past this grace period, it will be inferred to be a permanent
// one, and the corresponding task will be marked as failed.
ImagePullBackoffGracePeriod config2.Duration `json:"image-pull-backoff-grace-period" pflag:"-,Time to wait for transient ImagePullBackoff errors to be resolved."`

// The name of the GPU resource to use when the task resource requests GPUs.
GpuResourceName v1.ResourceName `json:"gpu-resource-name" pflag:"-,The name of the GPU resource to use when the task resource requests GPUs."`

Expand Down
16 changes: 13 additions & 3 deletions go/tasks/pluginmachinery/flytek8s/pod_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -521,9 +521,19 @@ func DemystifyPending(status v1.PodStatus) (pluginsCore.PhaseInfo, error) {

case "ImagePullBackOff":
t := c.LastTransitionTime.Time
return pluginsCore.PhaseInfoRetryableFailureWithCleanup(finalReason, finalMessage, &pluginsCore.TaskInfo{
OccurredAt: &t,
}), nil
if time.Since(t) >= config.GetK8sPluginConfig().ImagePullBackoffGracePeriod.Duration {
return pluginsCore.PhaseInfoRetryableFailureWithCleanup(finalReason, finalMessage, &pluginsCore.TaskInfo{
OccurredAt: &t,
}), nil
}

return pluginsCore.PhaseInfoInitializing(
t,
pluginsCore.DefaultPhaseVersion,
fmt.Sprintf("[%s]: %s", finalReason, finalMessage),
&pluginsCore.TaskInfo{OccurredAt: &t},
), nil

default:
// Since we are not checking for all error states, we may end up perpetually
// in the queued state returned at the bottom of this function, until the Pod is reaped
Expand Down
30 changes: 27 additions & 3 deletions go/tasks/pluginmachinery/flytek8s/pod_helper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,9 @@ func TestDemystifyPending(t *testing.T) {
CreateContainerErrorGracePeriod: config1.Duration{
Duration: time.Minute * 3,
},
ImagePullBackoffGracePeriod: config1.Duration{
Duration: time.Minute * 3,
},
}))

t.Run("PodNotScheduled", func(t *testing.T) {
Expand Down Expand Up @@ -689,8 +692,10 @@ func TestDemystifyPending(t *testing.T) {
assert.Equal(t, pluginsCore.PhaseInitializing, taskStatus.Phase())
})

t.Run("ImagePullBackOff", func(t *testing.T) {
s.ContainerStatuses = []v1.ContainerStatus{
t.Run("ImagePullBackOffWithinGracePeriod", func(t *testing.T) {
s2 := *s.DeepCopy()
s2.Conditions[0].LastTransitionTime = metav1.Now()
s2.ContainerStatuses = []v1.ContainerStatus{
{
Ready: false,
State: v1.ContainerState{
Expand All @@ -701,7 +706,26 @@ func TestDemystifyPending(t *testing.T) {
},
},
}
taskStatus, err := DemystifyPending(s)
taskStatus, err := DemystifyPending(s2)
assert.NoError(t, err)
assert.Equal(t, pluginsCore.PhaseInitializing, taskStatus.Phase())
})

// Subtest: a container waiting with reason "ImagePullBackOff" whose condition
// transitioned at least ImagePullBackoffGracePeriod ago is expected to be
// reported as a retryable failure.
t.Run("ImagePullBackOffOutsideGracePeriod", func(t *testing.T) {
// Operate on a deep copy so the outer status s is left unmodified.
s2 := *s.DeepCopy()
// Backdate the transition time by exactly the configured grace period, so the
// elapsed time is at least the grace period when DemystifyPending runs.
s2.Conditions[0].LastTransitionTime.Time = metav1.Now().Add(-config.GetK8sPluginConfig().ImagePullBackoffGracePeriod.Duration)
s2.ContainerStatuses = []v1.ContainerStatus{
{
Ready: false,
State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{
Reason: "ImagePullBackOff",
Message: "this is an error",
},
},
},
}
taskStatus, err := DemystifyPending(s2)
assert.NoError(t, err)
assert.Equal(t, pluginsCore.PhaseRetryableFailure, taskStatus.Phase())
})
Expand Down

0 comments on commit 3d2525a

Please sign in to comment.